#!/bin/bash

# Builds shuffled train/test corpora, plus fixed-size subcorpora, from the
# Europarl text files under $europarl/txt/<lang>/*.txt.
#
# For each language this writes, into the current directory:
#   europarl-<lang>-train.txt, europarl-<lang>-test.txt
#   europarl-<lang>-{train,test}.txt.<size> for each size listed below
#
# Requires: $europarl/tools/split-sentences.perl, $europarl/tools/tokenizer.perl,
# $tools/remove-xml, ./subfile (relative to the current directory), and shuf.

europarl=/opt/europarl
tools=~/svn/osdt/trunk/tools

# Fail loudly on misspelled variable names instead of expanding them to
# nothing (an empty file list would make cat read from stdin).
set -u

# An unmatched *.txt glob expands to nothing rather than to the literal pattern.
shopt -s nullglob

# Files whose path matches this extended regex form the test set; every other
# file goes to the training set.
test_pattern='-00-(10|11|12)-'

# Private scratch directory for the shuffle step, removed on exit.
scratch=$(mktemp -d) || { echo "cannot create temporary directory" >&2; exit 1; }
trap 'rm -rf -- "$scratch"' EXIT

# preprocess LANG FILE...
# Concatenates the given files and pipes them through split-sentences.perl,
# remove-xml and tokenizer.perl (in that order), writing the result to stdout.
# With no files it writes nothing, so the caller gets an empty output file.
preprocess() {
	local lang=$1
	shift
	if (( $# == 0 )); then
		return 0
	fi
	cat -- "$@" \
		| "$europarl/tools/split-sentences.perl" -l "$lang" \
		| "$tools/remove-xml" \
		| "$europarl/tools/tokenizer.perl" -l "$lang"
}

# shuffle_in_place FILE
# Replaces FILE with a random permutation of its lines. The original is only
# overwritten if shuf succeeded.
shuffle_in_place() {
	local file=$1
	local tmp=$scratch/shuffled
	shuf "$file" -o "$tmp" && mv -- "$tmp" "$file"
}

for lang in en da fi; do
	# Start from a clean slate for this language
	train=europarl-$lang-train.txt
	test=europarl-$lang-test.txt
	rm -f -- "$train" "$test"

	# Split the input files into test and train sets
	testfiles=()
	trainfiles=()
	for file in "$europarl/txt/$lang"/*.txt; do
		if [[ $file =~ $test_pattern ]]; then
			testfiles+=("$file")
		else
			trainfiles+=("$file")
		fi
	done
	if (( ${#testfiles[@]} == 0 )); then
		echo "warning: no test files found for '$lang' in $europarl/txt/$lang" >&2
	fi
	if (( ${#trainfiles[@]} == 0 )); then
		echo "warning: no train files found for '$lang' in $europarl/txt/$lang" >&2
	fi

	# Process all files (the +-expansion keeps empty arrays safe under set -u
	# on older bash versions)
	preprocess "$lang" ${testfiles[@]+"${testfiles[@]}"} > "$test"
	preprocess "$lang" ${trainfiles[@]+"${trainfiles[@]}"} > "$train"

	# Shuffle files
	shuffle_in_place "$train"
	shuffle_in_place "$test"

	# Create subcorpora; ./subfile reads the corpus on stdin and takes the
	# requested size as its argument (unit of the size is defined by subfile).
	for size in 10000 100000 1000000; do
		./subfile "$size" < "$train" > "$train.$size"
		./subfile "$size" < "$test" > "$test.$size"
	done
done

